# "A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P, if its performance on T, as measured by P, improves with E." -- Tom Mitchell (1997)
from gapminder import gapminder
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import math
import matplotlib.pylab as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# gdpPercap is heavily right-skewed; the log transform makes its
# relationship with life expectancy approximately linear for OLS.
gapminder["logGDP"] = np.log(gapminder["gdpPercap"])
gapminder[["logGDP", "lifeExp"]].head()
# Output:
#      logGDP  lifeExp
# 0  6.658583   28.801
# 1  6.710344   30.332
# 2  6.748878   31.997
# 3  6.728864   34.020
# 4  6.606625   36.088
# Scatter of life expectancy vs. log GDP per capita with fitted regression line.
reg_plot = sns.regplot(x = gapminder["logGDP"], y = gapminder["lifeExp"],
                       scatter_kws={"color": "black"}, line_kws={"color": "red"})
reg_plot

# Simple linear regression: lifeExp ~ logGDP.
lm = smf.ols(formula='lifeExp~logGDP', data = gapminder).fit()
print(lm.summary())
# Output:
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:  lifeExp          R-squared:          0.652
# Model:          OLS              Adj. R-squared:     0.652
# Method:         Least Squares    F-statistic:        3192.
# No. Observations: 1704           AIC:                1.176e+04
# Df Residuals:     1702           BIC:                1.177e+04
# Df Model:         1
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# Intercept     -9.1009      1.228     -7.413      0.000     -11.509      -6.693
# logGDP         8.4051      0.149     56.500      0.000       8.113       8.697
# ==============================================================================
# Omnibus: 148.382   Durbin-Watson: 0.398   Jarque-Bera (JB): 205.732
# Skew: -0.698       Prob(JB): 2.12e-45     Cond. No. 55.7
# Notes:
# [1] Standard Errors assume that the covariance matrix of the errors is
#     correctly specified.

# In-sample fitted values from the one-predictor model.
gapminder['predictedLifeExp'] = lm.predict()
# Predicted vs. observed life expectancy for the one-predictor model.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(x = gapminder["lifeExp"], y = gapminder["predictedLifeExp"], color = "black")
ax.plot([30, 90], [30, 90], color = "red")  # 45-degree reference line (perfect prediction)
ax.set_xlabel("Expected")
ax.set_ylabel("Predicted")

# Multiple regression: add year as a second predictor.
lm2 = smf.ols(formula='lifeExp~logGDP+year', data = gapminder).fit()
print(lm2.summary())
# Output:
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:  lifeExp          R-squared:          0.717
# Model:          OLS              Adj. R-squared:     0.717
# Method:         Least Squares    F-statistic:        2153.
# No. Observations: 1704           AIC:                1.141e+04
# Df Residuals:     1701           BIC:                1.143e+04
# Df Model:         2
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# Intercept   -391.0514     19.418    -20.138      0.000    -429.138    -352.965
# logGDP         7.7703      0.138     56.273      0.000       7.499       8.041
# year           0.1956      0.010     19.702      0.000       0.176       0.215
# ==============================================================================
# Omnibus: 149.973   Durbin-Watson: 0.326   Jarque-Bera (JB): 210.183
# Skew: -0.699       Prob(JB): 2.29e-46     Cond. No. 2.31e+05
# Notes:
# [1] Standard Errors assume that the covariance matrix of the errors is
#     correctly specified.
# [2] The condition number is large, 2.31e+05. This might indicate that there
#     are strong multicollinearity or other numerical problems.

# In-sample fitted values from the two-predictor model.
gapminder['predictedLifeExp2'] = lm2.predict()
# Predicted vs. observed life expectancy for the two-predictor model.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(x = gapminder["lifeExp"], y = gapminder["predictedLifeExp2"], color = "black")
ax.plot([30, 90], [30, 90], color = "red")  # 45-degree reference line
ax.set_xlabel("Expected")
ax.set_ylabel("Predicted")

# Clustering section: dependencies.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial.distance import cdist, pdist
import matplotlib.cm as cm

# Population is right-skewed; log-transform for clustering/plots.
gapminder["logPop"] = np.log(gapminder["pop"])
# Restrict to the most recent year in gapminder (2007).
# .copy() avoids pandas SettingWithCopyWarning when columns are added later.
gap_07 = gapminder[gapminder["year"] == 2007].copy()
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(x = gap_07["lifeExp"], y = gap_07["logPop"], color = "black")
ax.set_xlabel("Life Expectancy")
ax.set_ylabel("Population (log)")

# Optimal Clusters
# Silhouette Score
def silhouette_score_plot(k, data):
    """Fit KMeans with k clusters on data and return the mean silhouette score."""
    km = KMeans(n_clusters = k, random_state = 42)
    cluster_labels = km.fit_predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)
    return silhouette_avg

# DataFrame.append was removed in pandas 2.0; collect rows in a list and
# build the frame once instead.
score_rows = []
for n in range(2, 11):
    score = silhouette_score_plot(n, gap_07[["lifeExp", "logPop", "gdpPercap"]])
    score_rows.append({"k": n, "score": score})
scores = pd.DataFrame(score_rows)

fig, ax = plt.subplots(figsize=(7.5, 5))
ax.plot(scores["k"], scores["score"])
ax.set_xlabel("K")
ax.set_ylabel("Silhouette Coefficient")
ax.set_title("K with max coefficient = " + str(int(scores[scores["score"] == scores["score"].max()]["k"].squeeze())))

#Elbow Method
# DataFrame.append was removed in pandas 2.0; collect rows in a list and
# build the frame once instead.
sse_rows = []
for n in range(1, 11):
    km = KMeans(n_clusters = n, random_state = 42).fit(gap_07[["lifeExp", "logPop", "gdpPercap"]])
    # Inertia = sum of squared distances of samples to their closest cluster center
    sse_rows.append({"k": n, "sse": km.inertia_})
sse = pd.DataFrame(sse_rows)

fig, ax = plt.subplots(figsize=(7.5, 5))
ax.plot(sse["k"], sse["sse"])
ax.set_xlabel("K")
ax.set_ylabel("SSE")
ax.set_title("Elbow for KMeans Clustering")

# Clustering
# Final model: k = 2 (chosen from the silhouette / elbow diagnostics above).
km = KMeans(random_state = 42, n_clusters = 2)
result = km.fit(gap_07[["lifeExp", "logPop", "gdpPercap"]])
gap_07["cluster"] = result.labels_

# Cluster assignment vs. GDP per capita.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(x = gap_07["gdpPercap"], y = gap_07["lifeExp"], c = gap_07["cluster"])
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Life Expectancy")

# Cluster assignment vs. (log) population.
fig, ax = plt.subplots(figsize=(7.5, 5))
ax.scatter(x = gap_07["logPop"], y = gap_07["lifeExp"], c = gap_07["cluster"])
ax.set_xlabel("Population (log)")
ax.set_ylabel("Life Expectancy")

# Per-cluster means of the clustering features.
gap_07.groupby("cluster").agg({"lifeExp": "mean",
                               "logPop": "mean",
                               "gdpPercap": "mean"})
# Output:
#            lifeExp     logPop     gdpPercap
# cluster
# 0        79.171486  16.207095  31904.371711
# 1        63.028523  16.298523   5064.646622